** Data reading and variable selection from raw data
** 1984-2015 GERMAN SOCIO-ECONOMIC PANEL STUDY


** 01. Reading data **

* Session setup: close any log still open from an earlier run (errors are
* suppressed by capture), clear everything from memory and disable paging.
capture log close
clear all
set more off

* Working directory: fill in the path placeholder before running.
cd /*insert you work directory here*/


***Required variables are spread across several files. Therefore, the following steps involve reading several data files. We indicate which data file each stage corresponds to.***

*****************

*TAKE VARIABLES FROM BIO PARENT FILE*

* Load the raw file (fill in the path placeholder).
use /*read your data here*/

* Keep only the identifiers, the biography year, the sibling counts and the
* parental schooling, training and occupation variables processed below.
keep persnr hhnr bioyear nums numb geschw vsbil msbil visco88 misco88 visei misei vbbil mbbil

**PID AND HOUSEHOLD NUMBER**

rename (persnr hhnr) (pid hid)

label variable hid "Household Identifier"

**BIOYEAR**

label variable bioyear "Year bio information was collected"

**PARENTAL EDUCATION**

* msbil becomes the mother's and vsbil the father's school-degree variable.
rename (msbil vsbil) (moeduc_cat faeduc_cat)

label variable moeduc_cat "Education level of mother"
label variable faeduc_cat "Education level of father"

* One value-label set shared by both parents' school-degree variables.
label define educ_cat ///
    1 "Secondary School Degree" ///
    2 "Intermediate School Degree" ///
    3 "Technical School Degree" ///
    4 "Upper Secondary School Degree" ///
    5 "Other Degree" ///
    6 "No School Degree" ///
    7 "School Not Attended" ///
    8 "Compulsory School" ///
    9 "Continued School"
label values moeduc_cat faeduc_cat educ_cat

*Rename parental training variables*

* mbbil becomes the mother's and vbbil the father's training variable.
rename (mbbil vbbil) (motrain_cat fatrain_cat)

*Label parental training variables*

label variable motrain_cat "Vocational training of mother"
label variable fatrain_cat "Vocational training of father"

* One value-label set shared by both parents' training variables.
* NOTE(review): label texts are reproduced verbatim, including their spelling.
label define train_cat ///
    10 "No Vocational Degree" ///
    20 "Vocational Degree" ///
    21 "Trained in Foreign Company" ///
    22 "Trained long Time in Forein Country" ///
    23 "Foreign Vocational School" ///
    24 "Trade,Farming Apprentice" ///
    25 "Business Apprentice" ///
    26 "Health Care School" ///
    27 "Special Technical School" ///
    28 "Civil Service Training" ///
    30 "Tech Engineer School" ///
    31 "Foreign Collage" ///
    32 "College, University" ///
    40 "Other Training" ///
    50 "Currently in Vocational Training" ///
    51 "Currently in Schooling"
label values motrain_cat fatrain_cat train_cat

*Remove missings: every code below 1 is set to system missing*

foreach parentvar of varlist moeduc_cat faeduc_cat motrain_cat fatrain_cat {
    replace `parentvar' = . if `parentvar' < 1
}

***SIBLINGS**

* Build cleaned counts of sisters (nsis, from nums) and brothers (nbro, from
* numb). For each count the rules are applied in this order:
*   1. negative raw codes become missing;
*   2. raw code -2 together with geschw == 1 becomes 0;
*   3. geschw == 2 becomes 0.
* NOTE(review): presumably geschw == 2 means "no siblings" and -2 means
* "does not apply" - confirm against the codebook.
local raw_nsis nums
local raw_nbro numb

foreach clean in nsis nbro {
    local raw `raw_`clean''
    generate `clean' = `raw'
    replace `clean' = . if `raw' < 0
    replace `clean' = 0 if `raw' == -2 & geschw == 1
    replace `clean' = 0 if geschw == 2
}

tab nsis
tab nbro

label variable nsis "Number of sisters"
label variable nbro "Number of brothers"

* The raw inputs are no longer needed once the cleaned counts exist.
drop nums numb geschw

*create sibling variable*

* The sum is missing whenever either component is missing.
generate nsibs = nsis + nbro

* Use the full variable name: the original labelled "nsib", which only
* resolved to nsibs through variable abbreviation.
label variable nsibs "Number of siblings"

**PARENTAL OCCUPATION**

* Occupation codes (ISCO 88) and occupational status scores (ISEI) of both
* parents, renamed in one step.
rename (misco88 visco88 misei visei) (moocc_isco faocc_isco mo_isei fa_isei)

label variable moocc_isco "Mother's Occupation - ISCO 88"
label variable faocc_isco "Father's Occupation - ISCO 88"
label variable mo_isei "Mother's Occupational Status - ISEI88"
label variable fa_isei "Father's Occupational Status - ISEI88"

**SORT BY PID FOR FUTURE MERGE**

sort pid

**SAVE TEMPORARY SMALL FILE**

* Fill in the target file in the placeholder before running.
save /*insert you work directory here*/, replace

*************************

*TAKE VARIABLES FROM PGEN LONG FILE*

* Load the raw file (fill in the path placeholder).
use /*read your data here*/

* Keep the person/year keys plus the occupation, education and employment
* status variables processed below.
keep pid syear pgisco88 pgisced97 pgisced11 pgemplst

**EDUCATION**

*Rename education variables*

rename pgisced11 educ_isced11
rename pgisced97 educ_isced97

*Label respondent education variables*

label variable educ_isced11 "Education ISCED 2011"

* The 1997 label belongs to educ_isced97. The original applied it to
* educ_isced11, overwriting the 2011 label and leaving this one unlabelled.
label variable educ_isced97 "Education ISCED 1997"

*Remove missing: negative codes are set to system missing*

foreach educvar of varlist educ_isced11 educ_isced97 {
    replace `educvar' = . if `educvar' < 0
}

**EMPLOYMENT STATUS**

rename pgemplst empstat

label variable empstat "Employment Status"

* Value labels for the six employment-status categories.
label define empstat ///
    1 "Full-Time Employment" ///
    2 "Regular Part-Time" ///
    3 "Vocational Training" ///
    4 "Marginal, Irregular Part-Time" ///
    5 "Not Employed" ///
    6 "Sheltered workshop"
label values empstat empstat

* Negative codes are set to system missing.
replace empstat = . if empstat < 0

**OCCUPATION**

rename pgisco88 occ_isco

label variable occ_isco "Respondent Occupation ISCO 88"

**SORT BY PID AND YEAR FOR FUTURE MERGE**

sort pid syear

**SAVE TEMPORARY SMALL FILE**

* Fill in the target file in the placeholder before running.
save /*insert you work directory here*/, replace

*****************

*TAKE VARIABLES FROM PEQUIV LONG FILE*

* Load the raw file (fill in the path placeholder).
use /*read your data here*/

keep pid syear d11101 d11102ll d11109 l11102 x11104ll

* Give the raw variables readable names in one step:
*   d11102ll -> sex        (gender)
*   d11101   -> age        (age at survey)
*   d11109   -> educ_yrs   (years of education)
*   l11102   -> region     (West/East indicator)
*   x11104ll -> subsample  (subsample the individual was drawn from)
rename (d11102ll d11101 d11109 l11102 x11104ll) (sex age educ_yrs region subsample)

**GENDER, AGE, EDUC YEARS: negative codes are set to system missing**

foreach personvar of varlist sex age educ_yrs {
    replace `personvar' = . if `personvar' < 0
}

**BIRTHYR**

* Computed from the cleaned age, so it is missing wherever age is missing.
generate birthyr = syear - age

label variable birthyr "Respondent's birth year"
label variable educ_yrs "Respondent's years of education"

**REGION**

label variable region "West and East Germany Regional Indicator"

**SORT BY PID AND YEAR FOR FUTURE MERGE**

sort pid syear

**SAVE TEMPORARY SMALL FILE**

* Fill in the target file in the placeholder before running.
save /*insert you work directory here*/, replace

*************

*MERGE PGEN AND PEQUIV FILES*

* Master file (fill in the path placeholder).
use /*read your data here*/

* One record per person-year on both sides. keep(match) retains only the
* person-years present in both files and nogenerate suppresses _merge, so
* no separate clean-up of unmatched observations is needed.
merge 1:1 pid syear using /*read your data here*/, keep(match) nogenerate

**SAVE TEMPORARY SMALL FILE**

save /*insert you work directory here*/, replace

*MERGE PGEN/PEQUIV and BIOPAREN FILES*

* Master file: the combined person-year file (fill in the path placeholder).
use /*read your data here*/

* Many person-years per pid in the master, one record per pid in the using
* file. Again only matched observations are kept.
merge m:1 pid using /*read your data here*/, keep(match) nogenerate

**SAVE TEMPORARY SMALL FILE**

save /*insert you work directory here*/, replace


*************

*FORMAT NEW MASTER FILE*

* Load the merged file (fill in the path placeholder).
use /*read your data here*/

** Constructing year and country variables **

rename syear year
label variable year "survey year"

* Every observation receives the same country code.
generate country = 276
label variable country "ISO country code"
//GERMANY: 276 (see "ISO Country Codes.pdf) 

**Creating unified ISCED Education coding for respondents and parents**

*respondent education*

* Levels 1 to 8 of educ_isced11 map to 100, 200, ..., 800. Any other value
* leaves educ_ISCED missing.
generate educ_ISCED = .
forvalues level = 1/8 {
    replace educ_ISCED = 100 * `level' if educ_isced11 == `level'
}

label variable educ_ISCED "Respondent's Education - ISCED 2011"

**Parental Education**

* Derive an ISCED 2011 code and an approximate number of years of education
* for each parent from the school-degree (*educ_cat) and vocational-training
* (*train_cat) categories.
*
* Inputs:  moeduc_cat motrain_cat (mother), faeduc_cat fatrain_cat (father)
* Outputs: maeduc_ISCED maeduc_yrs (mother), faeduc_ISCED faeduc_yrs (father)
*
* The original began with "rename maeduc_cat moeduc_cat". No variable named
* maeduc_cat is ever created and moeduc_cat already exists, so that command
* stopped the script. It has been removed. The mother's inputs keep the mo
* prefix and her outputs keep the ma prefix, as in the rest of the original.
*
* Training is applied after schooling, so a mapped training category
* overrides the code derived from the school degree. School categories 5
* and 6 and training categories 10, 28, 30, 40, 50 and 51 are not mapped.

foreach parent in mo fa {

    * Output prefix and label wording for this parent.
    if "`parent'" == "mo" {
        local out   "ma"
        local owner "Mother's"
    }
    else {
        local out   "fa"
        local owner "Father's"
    }

    * ISCED code from the school degree ...
    generate `out'educ_ISCED = .
    replace `out'educ_ISCED = 244 if inlist(`parent'educ_cat, 1, 2, 8)
    replace `out'educ_ISCED = 344 if inlist(`parent'educ_cat, 3, 4, 9)
    replace `out'educ_ISCED = 100 if `parent'educ_cat == 7

    * ... overridden by vocational training where a mapping exists. The
    * upper bounds also keep missing training values out of these rules.
    replace `out'educ_ISCED = 354 if `parent'train_cat > 19 & `parent'train_cat < 27
    replace `out'educ_ISCED = 554 if `parent'train_cat == 27
    replace `out'educ_ISCED = 746 if `parent'train_cat > 30 & `parent'train_cat <= 32

    label variable `out'educ_ISCED "`owner' Education - ISCED 2011"
    tab `out'educ_ISCED

    * Years of education assigned to each ISCED code.
    generate `out'educ_yrs = .
    replace `out'educ_yrs = 4     if `out'educ_ISCED == 100
    replace `out'educ_yrs = 11.5  if `out'educ_ISCED == 244
    replace `out'educ_yrs = 13    if `out'educ_ISCED == 344
    replace `out'educ_yrs = 13.5  if `out'educ_ISCED == 354
    replace `out'educ_yrs = 14.5  if `out'educ_ISCED == 554
    replace `out'educ_yrs = 17.67 if `out'educ_ISCED == 746

    label variable `out'educ_yrs "`owner' years of education"
}



**SAVE TEMPORARY SMALL FILE**

* Fill in the target file in the placeholder before running.
save /*insert you work directory here*/, replace

*************

*CREATE SEPARATE FILES FOR EAST AND WEST GERMANY*

*Create West Germany File*

use /*read your data here*/

* Retain region 1 only (observations with any other or missing region go).
drop if region != 1

* West Germany country code: 276 is replaced by 280.
replace country = 280 if country == 276

save /*insert you work directory here*/, replace

*Create East Germany File*

use /*read your data here*/

* Retain region 2 only (observations with any other or missing region go).
drop if region != 2

* East Germany country code: 276 is replaced by 278.
replace country = 278 if country == 276

save /*insert you work directory here*/, replace

*************

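** CONVERT FILE FROM PERSON-PERIOD TO REPEATED CROSS-SECTION - KEEP ONLY ONE OBSERVATION FOR EACH INDIVIDUAL (THE WAVE IN WHICH BIOGRAPHY INFORMATION WAS COLLECTED; A RANDOM-WAVE ALTERNATIVE IS LEFT COMMENTED OUT) **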

**WEST GERMANY**

* Load the West Germany person-year file (fill in the path placeholder).
use /*read your data here*/

*Take only one observation for each individual - the wave in which the biography information was collected*

* bioyear_flag is 1 where the survey year equals bioyear and 0 otherwise.
generate bioyear_flag = (bioyear == year)
tab bioyear_flag

* Keep the flagged observations only.
keep if bioyear_flag == 1

*Alternative, currently disabled: pick one wave per individual at random to spread observations across years*

*generate random = runiform()
*sort random*
*egen pid_random = tag(pid)*
*sort pid*
*keep if pid_random == 1*
*drop random pid_random*

**Save new Repeated Cross Section master file **

save /*insert you work directory here*/, replace

**EAST GERMANY**

* Load the East Germany person-year file (fill in the path placeholder).
use /*read your data here*/

*Take only one observation for each individual - the wave in which the biography information was collected*

* bioyear_flag is 1 where the survey year equals bioyear and 0 otherwise.
generate bioyear_flag = (bioyear == year)
tab bioyear_flag

* Keep the flagged observations only.
keep if bioyear_flag == 1

*Alternative, currently disabled: pick one wave per individual at random to spread observations across years*

*generate random = runiform()
*sort random*
*egen pid_random = tag(pid)*
*sort pid*
*keep if pid_random == 1*
*drop random pid_random*

**Save new Repeated Cross Section master file **

save /*insert you work directory here*/, replace

*************

*CREATE SEPARATE FILES BY YEAR FOR EAST AND WEST GERMANY*

**WEST GERMANY**

* Write one file per survey year, WDEU1984 to WDEU2015, into the working
* directory. Fill in both placeholders before running.
cd /*insert you work directory here*/
use /*read your data here*/

forvalues yr = 1984/2015 {
    * Work on a temporary copy so the full file is back for the next year.
    preserve
    keep if year == `yr'
    * replace allows the script to be re-run. Without it, save stops with
    * "file already exists" as soon as a yearly file from an earlier run is
    * found. Every other save in this script already uses replace.
    save WDEU`yr', replace
    restore
}
  
**EAST GERMANY**

* Write one file per survey year, EDEU1990 to EDEU2015, into the working
* directory. Fill in both placeholders before running.
cd /*insert you work directory here*/
use /*read your data here*/

forvalues yr = 1990/2015 {
    * Work on a temporary copy so the full file is back for the next year.
    preserve
    keep if year == `yr'
    * replace allows the script to be re-run. Without it, save stops with
    * "file already exists" as soon as a yearly file from an earlier run is
    * found. Every other save in this script already uses replace.
    save EDEU`yr', replace
    restore
}

  
